# encoding: utf-8
import re

def _cellSpan(cellHtml, attrName):
	"""Return the integer value of a span attribute on a ``<td>`` opening tag.

	Only the opening tag is inspected, so text inside the cell cannot be
	mistaken for an attribute.  Double-quoted, single-quoted and unquoted
	values are accepted.  A missing or non-numeric value yields 1, and values
	below 1 are raised to 1 so that every cell occupies at least one slot.
	"""
	openingTag = cellHtml.partition(">")[0]
	found = re.search(r'(?<![\w-])' + attrName + r'\s*=\s*["\']?\s*(\d+)', openingTag)
	if found is None:
		return 1
	return max(int(found.group(1)), 1)


def tableParser(table):
	"""Expand an HTML table fragment into a grid indexed by row and column.

	Every ``<tr>...</tr>`` becomes a row and every ``<td>...</td>`` inside it
	becomes a cell.  Cells with ``rowspan``/``colspan`` keep their text in the
	top-left slot; every other slot they cover holds the placeholder string
	``"None"``.  Rows and cells may span several lines of markup.

	Args:
		table: HTML text containing the table rows.

	Returns:
		A dict ``{rowIndex: {colIndex: text}}``.  The same dict is printed to
		standard output before it is returned.
	"""
	rowList = re.findall(r"<tr.*?</tr>", table, re.DOTALL)
	rowCount = len(rowList)
	# The parsed table.  Every row exists up front so that a rowspan can mark
	# slots in rows that have not been visited yet.
	tableDict = {rowIndex: {} for rowIndex in range(rowCount)}

	for rowIndex, rowHtml in enumerate(rowList):
		currentRow = tableDict[rowIndex]
		colIndex = 0
		for cellHtml in re.findall(r"<td.*?</td>", rowHtml, re.DOTALL):
			occupyRow = _cellSpan(cellHtml, "rowspan")
			occupyCol = _cellSpan(cellHtml, "colspan")
			cellText = re.sub(
				'<td.*?>|</td>|<p.*?>|</p>|<span .*?>|</span>|<br/>',
				"",
				cellHtml,
				flags=re.DOTALL,
			)

			# Skip slots already claimed by a rowspan from an earlier row.
			while currentRow.get(colIndex) == "None":
				colIndex += 1

			currentRow[colIndex] = cellText
			# A rowspan may not reach past the last row of the table; clamp it
			# instead of failing on a row that does not exist.
			lastRow = min(rowIndex + occupyRow, rowCount)
			for coveredRow in range(rowIndex, lastRow):
				for coveredCol in range(colIndex, colIndex + occupyCol):
					if coveredRow != rowIndex or coveredCol != colIndex:
						tableDict[coveredRow][coveredCol] = "None"
			colIndex += occupyCol

	print(tableDict)
	return tableDict

def extractTable(filePath,outDir):
	"""Read an HTML file, merge split tables and parse each resulting table.

	Consecutive ``<table>`` elements separated by nothing but one empty
	paragraph (``<p style="text-indent: 0pt;text-align: left;"><br/></p>``)
	are joined into a single table: the rows of the later table are appended
	to the earlier one.  For each resulting table its index is printed and
	the table is handed to ``tableParser``.

	Args:
		filePath: Path of the UTF-8 encoded HTML file to read.
		outDir: Not used by this function; kept for interface compatibility.
	"""
	with open(filePath, 'r', encoding='utf-8') as fileObject:
		htmlFileContent = fileObject.read()

	separator = '<p style="text-indent: 0pt;text-align: left;"><br/></p>'
	closingLen = len("</table>")

	# 1. Table extraction
	newTableList = []
	currentParts = None   # pieces of the table being assembled, without its closing tag
	previousEnd = None    # offset just past the previously matched table
	for found in re.finditer(r'<table.*?/table>', htmlFileContent, re.DOTALL):
		tableHtml = found.group(0)
		# Compare the text actually lying between the two matches.  Searching
		# the whole document for the concatenation could hit an unrelated,
		# identical pair of tables and merge tables that are not adjacent.
		isContinuation = (
			currentParts is not None
			and htmlFileContent[previousEnd:found.start()] == separator
		)
		if isContinuation:
			# Drop the opening <table ...> tag and the closing tag so only
			# the rows are appended.
			headerLen = re.match(r'<table.*?>', tableHtml, re.DOTALL).end()
			currentParts.append(tableHtml[headerLen:-closingLen])
		else:
			if currentParts is not None:
				newTableList.append("".join(currentParts) + "</table>")
			currentParts = [tableHtml[:-closingLen]]
		previousEnd = found.end()
	if currentParts is not None:
		newTableList.append("".join(currentParts) + "</table>")

	# 2. Data extraction
	for tableIndex, tableHtml in enumerate(newTableList):
		print(tableIndex)
		tableParser(tableHtml)